example-patentsview_inventors.RmdBuilt with R 4.2.1
This example explores the PatentsView bulk tables, with a focus on assigned inventor sex.
Before starting, we’ll need to load the package, and point to a directory where we’d like things saved:
# install if needed: remotes::install_guthub("uva-bi-sdad/uspto")
library(uspto)
outDir <- "../patentsview/"The first step toward answering this question is to assign a sex to inventors based on their given name. The inventors table includes USPTO’s assignment, which we’ll start with:
# you may need to increase your download timeout, depending on your connection
options(timeout = 300)
inventors <- as.data.frame(download_patentsview_bulk("inventor", outDir))Now, we can add a few of our own prediction methods, including one based on the included USPTO flags:
# add a standardized version of given names
inventors$name_first[is.na(inventors$name_first)] <- ""
inventors$given <- sub(
"^(.)", "\\U\\1", gsub("^([a-z-]{,6}[.-])+ | +.*", "", tolower(inventors$name_first)),
perl = TRUE
)
# associate a sex with each unique given name
## install if needed: remotes::install_github("miserman/lusilab")
library(lusilab)
inventor_sex <- predict_demographics(unique(inventors$given), dir = paste0(dirname(outDir), "/names"))
inventor_sex <- inventor_sex[!duplicated(inventor_sex$given), ]
# get categorical predictions
prob_cols <- grep("^prob_", colnames(inventor_sex), value = TRUE)
inventor_sex_preds <- inventor_sex[, prob_cols]
dimnames(inventor_sex_preds) <- list(
inventor_sex$given,
sub("prob", "pred", prob_cols, fixed = TRUE)
)
inventor_sex_preds[inventor_sex_preds > .5] <- 1
inventor_sex_preds[inventor_sex_preds < .5] <- 0Before adding this to the inventors data, we can compare these different methods with the USPTO assignments:
# start with unique, assignment-processed inventors
unique_inventors <- inventors[inventors$attribution_status != 98, ]
unique_inventors <- unique_inventors[
!duplicated(unique_inventors$id) & unique_inventors$given %in% rownames(inventor_sex_preds),
c("given", "male_flag")
]
# reverse the male flag, and add the other predictions
unique_inventors$male_flag[is.na(unique_inventors$male_flag)] <- .5
unique_inventors$pred_fem_patentsview <- 1 - unique_inventors$male_flag
unique_inventors <- cbind(unique_inventors[, c(1, 3)], inventor_sex_preds[unique_inventors$given, ])
kable(
data.frame(
"Proportion Sexed" = colMeans(unique_inventors[, -1] != .5),
"Accuracy to PatentsView" = colMeans(unique_inventors[, -1] == unique_inventors[, 2]),
check.names = FALSE
),
caption = "Sex Predition Method Comparisons"
)| Proportion Sexed | Accuracy to PatentsView | |
|---|---|---|
| pred_fem_patentsview | 0.9243208 | 1.0000000 |
| pred_fem_wgnd | 0.8962996 | 0.8513556 |
| pred_fem_fb | 0.9001380 | 0.8402242 |
| pred_fem_fb_scraped | 0.7359894 | 0.7377783 |
| pred_fem_skydeck | 0.7991721 | 0.7862536 |
| pred_fem_usssa | 0.7812736 | 0.7743813 |
| pred_fem_uspto | 0.9944673 | 0.8715289 |
Now we can move forward with the method with most coverage:
# add assigned sex to inventors data
inventors$pred_fem <- inventor_sex_preds[inventors$given, "pred_fem_uspto"]
## focus on just sexed inventors
inventors <- inventors[!is.na(inventors$pred_fem) & inventors$pred_fem != .5, ]
rownames(inventors) <- inventors$id
# add patent IDs
patent_inventor <- download_patentsview_bulk("patent_inventor", outDir)
patent_inventor <- patent_inventor[patent_inventor$inventor_id %in% inventors$id, ]
inventors <- cbind(inventors[patent_inventor$inventor_id, -1], patent_inventor)
# count inventors predicted to be female in each patent
female_inventors <- tapply(inventors$pred_fem == 1, inventors$patent_id, sum)Now we can add patent category information, to get breakdowns of classes by inventor sex. There are multiple classification schemes, but we can start with the World Intellectual Property Organization (WIPO) technology fields for a high-level overview:
library(Matrix)
# this makes a patent x WIPO category matrix
categories_wipo <- patentsview_class_matrix("wipo", paste0(outDir, "wipo_matrix.rds"), dir = outDir)
dim(categories_wipo)
#> [1] 7221215 35
# which we can join to the inventors matrix
inventors_wipo <- female_inventors[names(female_inventors) %in% rownames(categories_wipo)]
inventors_wipo <- cbind(as.numeric(inventors_wipo), categories_wipo[names(inventors_wipo), ])
# and get category breakdowns by inventor sex
wipo_summary <- data.frame(
Any_Female = colSums(inventors_wipo[inventors_wipo[, 1] != 0, -1] != 0),
No_Female = colSums(inventors_wipo[inventors_wipo[, 1] == 0, -1] != 0)
)
wipo_summary$Any_Female_Proportion <- wipo_summary$Any_Female / rowSums(wipo_summary)
wipo_summary <- wipo_summary[order(-wipo_summary$Any_Female_Proportion), ]
# add category titles
wipo_field <- as.data.frame(download_patentsview_bulk("wipo_field", outDir))
rownames(wipo_field) <- wipo_field$id
wipo_summary$Title <- wipo_field[rownames(wipo_summary), "field_title"]
kable(
wipo_summary,
col.names = gsub("_", " ", colnames(wipo_summary), fixed = TRUE),
caption = "World Intellectual Property Organization Categories"
)| Any Female | No Female | Any Female Proportion | Title | |
|---|---|---|---|---|
| 15 | 95837 | 119325 | 0.4454179 | Biotechnology |
| 16 | 130366 | 178793 | 0.4216795 | Pharmaceuticals |
| 11 | 29377 | 53264 | 0.3554773 | Analysis of biological materials |
| 14 | 103048 | 215153 | 0.3238456 | Organic fine chemistry |
| 18 | 17566 | 49561 | 0.2616831 | Food chemistry |
| 19 | 63981 | 183795 | 0.2582211 | Basic materials chemistry |
| 7 | 42753 | 127448 | 0.2511912 | IT methods for management |
| 22 | 15148 | 47121 | 0.2432671 | Micro-structural and nano-technology |
| 17 | 47131 | 149718 | 0.2394272 | Macromolecular chemistry, polymers |
| 4 | 123140 | 459269 | 0.2114322 | Digital communication |
| 8 | 95285 | 385653 | 0.1981233 | Semiconductors |
| 6 | 195144 | 793346 | 0.1974163 | Computer technology |
| 13 | 92372 | 394940 | 0.1895541 | Medical technology |
| 21 | 32318 | 145255 | 0.1819984 | Surface technology, coating |
| 20 | 30840 | 142421 | 0.1779974 | Materials, metallurgy |
| 34 | 33974 | 157637 | 0.1773071 | Other consumer goods |
| 3 | 65270 | 322662 | 0.1682511 | Telecommunications |
| 9 | 74222 | 382155 | 0.1626331 | Optics |
| 23 | 42707 | 224038 | 0.1601042 | Chemical engineering |
| 2 | 84874 | 462135 | 0.1551602 | Audio-visual technology |
| 28 | 29643 | 163823 | 0.1532207 | Textile and paper machines |
| 12 | 35854 | 201649 | 0.1509623 | Control |
| 24 | 17007 | 107006 | 0.1371388 | Environmental technology |
| 10 | 69791 | 439170 | 0.1371245 | Measurement |
| 33 | 31347 | 198811 | 0.1361977 | Furniture, games |
| 1 | 83984 | 535169 | 0.1356434 | Electrical machinery, apparatus, energy |
| 29 | 39767 | 286568 | 0.1218594 | Other special machines |
| 5 | 21258 | 160036 | 0.1172571 | Basic communication processes |
| 30 | 11648 | 109139 | 0.0964342 | Thermal processes and apparatus |
| 32 | 40029 | 382258 | 0.0947910 | Transport |
| 25 | 21193 | 212107 | 0.0908401 | Handling |
| 27 | 23310 | 238994 | 0.0888664 | Engines, pumps, turbines |
| 26 | 18191 | 203697 | 0.0819828 | Machine tools |
| 35 | 20622 | 238441 | 0.0796023 | Civil engineering |
| 31 | 21081 | 275514 | 0.0710767 | Mechanical elements |
For a more refined classification, we might look at the United States Patent Classification (USPC), which is most closely related to examination process:
# get the patent x USPC category matrix
categories_uspc <- patentsview_class_matrix(
"uspc_current", paste0(outDir, "uspc_current_matrix.rds"),
dir = outDir
)
dim(categories_uspc)
#> [1] 6597925 475
# join to the inventors matrix
inventors_uspc <- female_inventors[names(female_inventors) %in% rownames(categories_uspc)]
inventors_uspc <- cbind(as.numeric(inventors_uspc), categories_uspc[names(inventors_uspc), ])
# and get category breakdowns by inventor sex
uspc_summary <- data.frame(
Any_Female = colSums(inventors_uspc[inventors_uspc[, 1] != 0, -1] != 0),
No_Female = colSums(inventors_uspc[inventors_uspc[, 1] == 0, -1] != 0)
)
uspc_summary$Any_Female_Proportion <- uspc_summary$Any_Female / rowSums(uspc_summary)
uspc_summary <- uspc_summary[order(-uspc_summary$Any_Female_Proportion), ]
# add category titles
uspc_field <- as.data.frame(download_patentsview_bulk("mainclass_current", outDir))
rownames(uspc_field) <- uspc_field$id
uspc_summary$Title <- uspc_field[rownames(uspc_summary), "title"]
kable(
uspc_summary[c(1:20, 1:20 + nrow(uspc_summary) - 20), ],
col.names = gsub("_", " ", colnames(uspc_summary), fixed = TRUE),
caption = paste(
"U.S. Patent Classification categories with the highest and lowest",
"proportion of any female inventor"
)
)| Any Female | No Female | Any Female Proportion | Title | |
|---|---|---|---|---|
| 450 | 865 | 619 | 0.5828841 | FOUNDATION GARMENTS |
| 532 | 9 | 11 | 0.4500000 | ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES |
| 530 | 22685 | 31356 | 0.4197739 | CHEMISTRY: NATURAL RESINS OR DERIVATIVES; PEPTIDES OR PROTEINS; LIGNINS OR REACTION PRODUCTS THEREOF |
| 536 | 23999 | 34275 | 0.4118303 | ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES |
| D05 | 1040 | 1520 | 0.4062500 | TEXTILE OR PAPER YARD GOODS; SHEET MATERIAL |
| 520 | 55 | 81 | 0.4044118 | SYNTHETIC RESINS OR NATURAL RUBBERS – PART OF THE CLASS 520 SERIES |
| 435 | 50207 | 80953 | 0.3827920 | CHEMISTRY: MOLECULAR BIOLOGY AND MICROBIOLOGY |
| 424 | 37241 | 64363 | 0.3665308 | DRUG, BIO-AFFECTING AND BODY TREATING COMPOSITIONS |
| 514 | 53199 | 97628 | 0.3527154 | DRUG, BIO-AFFECTING AND BODY TREATING COMPOSITIONS |
| 544 | 13827 | 27792 | 0.3322281 | ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES |
| D01 | 815 | 1657 | 0.3296926 | EDIBLE PRODUCTS |
| 546 | 14971 | 30560 | 0.3288089 | ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES |
| 506 | 899 | 1852 | 0.3267903 | COMBINATORIAL CHEMISTRY TECHNOLOGY: METHOD, LIBRARY, APPARATUS |
| 800 | 5307 | 11200 | 0.3215000 | MULTICELLULAR LIVING ORGANISMS AND UNMODIFIED PARTS THEREOF AND RELATED PROCESSES |
| 510 | 6014 | 12777 | 0.3200468 | CLEANING COMPOSITIONS FOR SOLID SURFACES, AUXILIARY COMPOSITIONS THEREFOR, OR PROCESSES OF PREPARING THE COMPOSITIONS |
| 548 | 12885 | 29900 | 0.3011569 | ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES |
| 132 | 2091 | 4869 | 0.3004310 | TOILET |
| D28 | 2973 | 6924 | 0.3003941 | COSMETIC PRODUCTS AND TOILET ARTICLES |
| 147 | 8 | 19 | 0.2962963 | COOPERING |
| D02 | 6847 | 16458 | 0.2937996 | APPAREL AND HABERDASHERY |
| 185 | 18 | 447 | 0.0387097 | MOTORS: SPRING, WEIGHT, OR ANIMAL POWERED |
| 406 | 137 | 3416 | 0.0385590 | CONVEYORS: FLUID CURRENT |
| 245 | 9 | 227 | 0.0381356 | WIRE FABRICS AND STRUCTURE |
| 105 | 188 | 4765 | 0.0379568 | RAILWAY ROLLING STOCK |
| 305 | 64 | 1628 | 0.0378251 | WHEEL SUBSTITUTES FOR LAND VEHICLES |
| 254 | 331 | 8469 | 0.0376136 | IMPLEMENTS OR APPARATUS FOR APPLYING PUSHING OR PULLING FORCE |
| 144 | 234 | 6007 | 0.0374940 | WOODWORKING |
| 408 | 307 | 7944 | 0.0372076 | CUTTING BY USE OF ROTATING AXIALLY MOVING TOOL |
| 984 | 124 | 3226 | 0.0370149 | MUSICAL INSTRUMENTS |
| 226 | 189 | 5009 | 0.0363601 | ADVANCING MATERIAL OF INDETERMINATE LENGTH |
| 470 | 43 | 1142 | 0.0362869 | THREADED, HEADED FASTENER, OR WASHER MAKING: PROCESS AND APPARATUS |
| 198 | 865 | 23188 | 0.0359623 | CONVEYORS: POWER-DRIVEN |
| 299 | 158 | 4347 | 0.0350721 | MINING OR IN SITU DISINTEGRATION OF HARD MATERIAL |
| 171 | 18 | 511 | 0.0340265 | UNEARTHING PLANTS OR BURIED OBJECTS |
| 42 | 244 | 6945 | 0.0339407 | FIREARMS |
| 82 | 135 | 4062 | 0.0321658 | TURNING |
| 413 | 27 | 818 | 0.0319527 | SHEET METAL CONTAINER MAKING |
| 295 | 6 | 195 | 0.0298507 | RAILWAY WHEELS AND AXLES |
| 212 | 86 | 3028 | 0.0276172 | TRAVERSING HOISTS |
| 258 | 0 | 16 | 0.0000000 | RAILWAY MAIL DELIVERY |
To get at this question, we’ll need to associate a date with each patent number:
# start with all patent information
patents <- download_patentsview_bulk("patent", outDir)
# associate a date with each patent number
all_dates <- structure(patents$date, names = patents$number)
# get years, and focus on only those found in the USPC set
uspc_year <- substr(all_dates[names(all_dates) %in% rownames(inventors_uspc)], 1, 4)
# use this set of years to break down the overall summaries
uspc_yearly_summaries <- do.call(rbind, lapply(sort(unique(uspc_year)), function(year) {
d <- inventors_uspc[names(which(uspc_year == year)), ]
r <- data.frame(
Any_Female = colSums(d[d[, 1] != 0, -1] != 0, na.rm = TRUE),
No_Female = colSums(d[d[, 1] == 0, -1] != 0, na.rm = TRUE)
)
structure(c(as.numeric(year), r$Any_Female / rowSums(r)), names = c("Year", rownames(r)))
}))
uspc_yearly_summaries[is.na(uspc_yearly_summaries)] <- 0
uspc_yearly_summaries <- uspc_yearly_summaries[, colSums(uspc_yearly_summaries) != 0]
# plot categories with the most positive and negative trends
library(splot)
trends <- sort(cor(uspc_yearly_summaries[, -1], uspc_yearly_summaries[, 1])[, 1], TRUE)
splot(
uspc_yearly_summaries[, names(trends[c(1:6, 1:3 + length(trends) - 3)])] ~ uspc_yearly_summaries[, 1],
lines = "spline", levels = list(mv = uspc_field[names(trends[c(1:6, 1:3 + length(trends) - 3)]), "title"]),
title = "Proportion of Female-Assigned Inventors over Time",
laby = "Proportion of Patents With Any Female-Assigned Inventor",
labx = "Year", myl = c(0, .4), leg.title = "U.S. Patent Classification"
)Cooperative Patent Classifications (CPC) might give an even more refined look, and could be compared across patent offices:
# get the patent x CPC category matrix
categories_cpc <- patentsview_class_matrix(
"cpc_current", paste0(outDir, "cpc_current_matrix.rds"),
dir = outDir
)
dim(categories_cpc)
#> [1] 7228678 670
# join to the inventors matrix
inventors_cpc <- female_inventors[names(female_inventors) %in% rownames(categories_cpc)]
inventors_cpc <- cbind(as.numeric(inventors_cpc), categories_cpc[names(inventors_cpc), ])
# and get category breakdowns by inventor sex
cpc_summary <- data.frame(
Any_Female = colSums(inventors_cpc[inventors_cpc[, 1] != 0, -1] != 0),
No_Female = colSums(inventors_cpc[inventors_cpc[, 1] == 0, -1] != 0)
)
cpc_summary$Any_Female_Proportion <- cpc_summary$Any_Female / rowSums(cpc_summary)
cpc_summary <- cpc_summary[order(-cpc_summary$Any_Female_Proportion), ]
# add category titles
cpc_group <- as.data.frame(download_patentsview_bulk("cpc_group", outDir))
rownames(cpc_group) <- cpc_group$id
cpc_summary <- cpc_summary[rownames(cpc_summary) %in% cpc_group$id, ]
cpc_summary$Title <- cpc_group[rownames(cpc_summary), "title"]
cpc_selection <- names(which(rowSums(cpc_summary[, 1:2]) > 5))
cpc_selection <- cpc_selection[c(1:20, 1:20 + length(cpc_selection) - 20)]
kable(
cpc_summary[cpc_selection, ],
col.names = gsub("_", " ", colnames(cpc_summary), fixed = TRUE),
caption = paste(
"Cooperative Patent Classifications: 20 categories with the highest and lowest",
"proportion of any female inventor with at least 5 associated patents"
)
)| Any Female | No Female | Any Female Proportion | Title | |
|---|---|---|---|---|
| A41C | 1080 | 624 | 0.6338028 | CORSETS; BRASSIERES |
| C12Y | 9228 | 8665 | 0.5157324 | ENZYMES |
| A23Y | 407 | 384 | 0.5145386 | INDEXING SCHEME RELATING TO LACTIC OR PROPIONIC ACID BACTERIA USED IN FOODSTUFFS OR FOOD PREPARATION |
| B42P | 449 | 449 | 0.5000000 | INDEXING SCHEME RELATING TO BOOKS, FILING APPLIANCES OR THE LIKE |
| C07K | 50697 | 55282 | 0.4783684 | PEPTIDES |
| C12N | 49212 | 55853 | 0.4683958 | MICROORGANISMS OR ENZYMES; COMPOSITIONS THEREOF; PROPAGATING, PRESERVING, OR MAINTAINING MICROORGANISMS; MUTATION OR GENETIC ENGINEERING; CULTURE MEDIA |
| A61P | 97848 | 118156 | 0.4529916 | SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS |
| C12R | 2959 | 3605 | 0.4507922 | INDEXING SCHEME ASSOCIATED WITH SUBCLASSES C12C - C12Q, RELATING TO MICROORGANISMS |
| C12P | 11386 | 14667 | 0.4370322 | FERMENTATION OR ENZYME-USING PROCESSES TO SYNTHESISE A DESIRED CHEMICAL COMPOUND OR COMPOSITION OR TO SEPARATE OPTICAL ISOMERS FROM A RACEMIC MIXTURE |
| A41B | 1246 | 1610 | 0.4362745 | SHIRTS; UNDERWEAR; BABY LINEN; HANDKERCHIEFS |
| A61Q | 15280 | 20272 | 0.4297930 | SPECIFIC USE OF COSMETICS OR SIMILAR TOILET PREPARATIONS |
| C12Q | 21156 | 28804 | 0.4234588 | MEASURING OR TESTING PROCESSES INVOLVING ENZYMES, NUCLEIC ACIDS OR MICROORGANISMS ; COMPOSITIONS OR TEST PAPERS THEREFOR; PROCESSES OF PREPARING SUCH COMPOSITIONS; CONDITION-RESPONSIVE CONTROL IN MICROBIOLOGICAL OR ENZYMOLOGICAL PROCESSES |
| A61K | 109600 | 152294 | 0.4184899 | PREPARATIONS FOR MEDICAL, DENTAL, OR TOILET PURPOSES |
| A23V | 4069 | 5723 | 0.4155433 | INDEXING SCHEME RELATING TO FOODS, FOODSTUFFS OR NON-ALCOHOLIC BEVERAGES |
| A01P | 10 | 15 | 0.4000000 | BIOCIDAL, PEST REPELLANT, PEST ATTRACTANT OR PLANT GROWTH REGULATORY ACTIVITY OF CHEMICAL COMPOUNDS OR PREPARATIONS |
| A47D | 1661 | 2571 | 0.3924858 | FURNITURE SPECIALLY ADAPTED FOR CHILDREN |
| G16B | 2223 | 3473 | 0.3902739 | BIOINFORMATICS, i.e. INFORMATION AND COMMUNICATION TECHNOLOGY [ICT] SPECIALLY ADAPTED FOR GENETIC OR PROTEIN-RELATED DATA PROCESSING IN COMPUTATIONAL MOLECULAR BIOLOGY |
| C07B | 4106 | 6922 | 0.3723250 | GENERAL METHODS OF ORGANIC CHEMISTRY; APPARATUS THEREFOR |
| A21D | 1377 | 2357 | 0.3687734 | TREATMENT, e.g. PRESERVATION, OF FLOUR OR DOUGH, e.g. BY ADDITION OF MATERIALS; BAKING; BAKERY PRODUCTS; PRESERVATION THEREOF |
| C11D | 8140 | 14651 | 0.3571585 | DETERGENT COMPOSITIONS ; USE OF SINGLE SUBSTANCES AS DETERGENTS; SOAP OR SOAP-MAKING; RESIN SOAPS; RECOVERY OF GLYCEROL |
| B66C | 343 | 7818 | 0.0420292 | CRANES; LOAD-ENGAGING ELEMENTS OR DEVICES FOR CRANES, CAPSTANS, WINCHES, OR TACKLES |
| B27B | 193 | 4508 | 0.0410551 | SAWS FOR WOOD OR SIMILAR MATERIAL; COMPONENTS OR ACCESSORIES THEREFOR |
| A01F | 173 | 4077 | 0.0407059 | PROCESSING OF HARVESTED PRODUCE; HAY OR STRAW PRESSES; DEVICES FOR STORING AGRICULTURAL OR HORTICULTURAL PRODUCE |
| B61J | 6 | 143 | 0.0402685 | SHIFTING OR SHUNTING OF RAIL VEHICLES |
| D03C | 36 | 871 | 0.0396913 | SHEDDING MECHANISMS; PATTERN CARDS OR CHAINS; PUNCHING OF CARDS; DESIGNING PATTERNS |
| F41A | 461 | 11186 | 0.0395810 | FUNCTIONAL FEATURES OR DETAILS COMMON TO BOTH SMALLARMS AND ORDNANCE, e.g. CANNONS; MOUNTINGS FOR SMALLARMS OR ORDNANCE |
| G10B | 3 | 75 | 0.0384615 | ORGANS, HARMONIUMS OR SIMILAR WIND MUSICAL INSTRUMENTS WITH ASSOCIATED BLOWING APPARATUS |
| E01B | 129 | 3255 | 0.0381206 | PERMANENT WAY; PERMANENT-WAY TOOLS; MACHINES FOR MAKING RAILWAYS OF ALL KINDS |
| D03J | 18 | 481 | 0.0360721 | AUXILIARY WEAVING APPARATUS; WEAVERS’ TOOLS; SHUTTLES |
| D01H | 131 | 3665 | 0.0345100 | SPINNING OR TWISTING |
| B62C | 2 | 61 | 0.0317460 | VEHICLES DRAWN BY ANIMALS |
| B27F | 41 | 1322 | 0.0300807 | DOVETAILED WORK; TENONS; SLOTTING MACHINES FOR WOOD OR SIMILAR MATERIAL; NAILING OR STAPLING MACHINES |
| B21G | 10 | 326 | 0.0297619 | MAKING NEEDLES, PINS OR NAILS OF METAL |
| B27L | 45 | 1481 | 0.0294889 | REMOVING BARK OR VESTIGES OF BRANCHES ; SPLITTING WOOD; MANUFACTURE OF VENEER, WOODEN STICKS, WOOD SHAVINGS, WOOD FIBRES OR WOOD POWDER |
| F16T | 12 | 400 | 0.0291262 | STEAM TRAPS OR LIKE APPARATUS FOR DRAINING-OFF LIQUIDS FROM ENCLOSURES PREDOMINANTLY CONTAINING GASES OR VAPOURS |
| D02H | 7 | 254 | 0.0268199 | WARPING, BEAMING OR LEASING |
| F27M | 8 | 295 | 0.0264026 | INDEXING SCHEME RELATING TO ASPECTS OF THE CHARGES OR FURNACES, KILNS, OVENS OR RETORTS |
| B41G | 1 | 52 | 0.0188679 | APPARATUS FOR BRONZE PRINTING, LINE PRINTING, OR FOR BORDERING OR EDGING SHEETS OR LIKE ARTICLES; AUXILIARY FOR PERFORATING IN CONJUNCTION WITH PRINTING |
| G06D | 0 | 9 | 0.0000000 | DIGITAL FLUID-PRESSURE COMPUTING DEVICES |
| G21J | 0 | 20 | 0.0000000 | NUCLEAR EXPLOSIVES; APPLICATIONS THEREOF |
The locations table associates inventor IDs with location IDs:
locations <- as.data.frame(download_patentsview_bulk("location", outDir))
locations$state_fips[!is.na(locations$state_fips)] <- formatC(
locations$state_fips[!is.na(locations$state_fips)],
width = 2, flag = 0, format = "d"
)
locations$county_fips[!is.na(locations$county_fips)] <- formatC(
locations$county_fips[!is.na(locations$county_fips)],
width = 5, flag = 0, format = "d"
)
# we can align this with our inventors data for inventor sex
rownames(locations) <- locations$id
located_inventors <- inventors[!is.na(inventors$location_id), ]
locations <- locations[located_inventors$location_id, ]Inventor location is recorded as part of each patent, which means inventors may have multiple locations over time. For an initial look, we can focus only on each inventors most recent location:
# add date information to inventor data
inventors$date <- structure(patents$date, names = patents$number)[inventors$patent_id]
# sort inventors by date, then add location information
inventors_last_seen <- inventors[order(inventors$date, decreasing = TRUE), ]
inventors_last_seen <- inventors_last_seen[!duplicated(inventors_last_seen$inventor_id), ]
inventors_last_seen <- inventors_last_seen[inventors_last_seen$location_id %in% locations$id, ]
inventors_last_seen <- cbind(inventors_last_seen, locations[inventors_last_seen$location_id, ])Now we can look at high-level summaries of locations, like we did with patent classes:
# top countries
breakdown_countries <- as.data.frame(t(vapply(
split(inventors_last_seen$pred_fem, inventors_last_seen$country),
function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
c(0, 0)
)))
breakdown_countries$Proportion_Female <- breakdown_countries$Female / rowSums(breakdown_countries)
breakdown_countries <- breakdown_countries[order(-breakdown_countries$Proportion_Female), ]
kable(
breakdown_countries[rowSums(breakdown_countries[, 1:2]) > 1e4, ],
col.names = gsub("_", " ", colnames(breakdown_countries), fixed = TRUE),
caption = "Countries with at least 10,000 associated inventors"
)| Female | Male | Proportion Female | |
|---|---|---|---|
| CN | 30697 | 111127 | 0.2164443 |
| SU | 2868 | 12402 | 0.1878193 |
| ES | 4204 | 18262 | 0.1871272 |
| SG | 2121 | 11255 | 0.1585676 |
| KR | 22265 | 123451 | 0.1527972 |
| IL | 5889 | 34888 | 0.1444196 |
| DK | 2318 | 14376 | 0.1388523 |
| IN | 7962 | 49936 | 0.1375177 |
| FR | 17124 | 113171 | 0.1314248 |
| US | 248082 | 1676877 | 0.1288765 |
| RU | 1767 | 11997 | 0.1283784 |
| BE | 2662 | 18493 | 0.1258331 |
| IT | 6479 | 47636 | 0.1197265 |
| FI | 2414 | 18484 | 0.1155134 |
| AU | 3433 | 26615 | 0.1142505 |
| CA | 11256 | 94016 | 0.1069230 |
| SE | 4272 | 36451 | 0.1049039 |
| GB | 11575 | 100464 | 0.1033122 |
| JP | 47228 | 469031 | 0.0914812 |
| CH | 3774 | 38745 | 0.0887603 |
| NO | 950 | 10112 | 0.0858796 |
| NL | 4008 | 43075 | 0.0851263 |
| DE | 22507 | 269343 | 0.0771184 |
| AT | 1392 | 17985 | 0.0718377 |
| TW | 7228 | 96518 | 0.0696702 |
# top states
breakdown_states <- as.data.frame(t(vapply(
split(inventors_last_seen$pred_fem, inventors_last_seen$state_fips),
function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
c(0, 0)
)))
breakdown_states$Proportion_Female <- breakdown_states$Female / rowSums(breakdown_states)
breakdown_states <- breakdown_states[order(-breakdown_states$Proportion_Female), ]
## install if needed: remotes::install_github("uva-bi-sdad/catchment")
library(catchment)
states <- download_census_shapes(paste0(dirname(outDir), "/maps"))
state_names <- structure(states$NAME, names = states$STATEFP)
breakdown_states <- breakdown_states[rownames(breakdown_states) %in% names(state_names), ]
rownames(breakdown_states) <- state_names[rownames(breakdown_states)]
kable(
breakdown_states[rowSums(breakdown_states[, 1:2]) > 1e4, ],
col.names = gsub("_", " ", colnames(breakdown_states), fixed = TRUE),
caption = "States with at least 10,000 associated inventors"
)| Female | Male | Proportion Female | |
|---|---|---|---|
| Maryland | 5523 | 29584 | 0.1573191 |
| New Jersey | 10743 | 59426 | 0.1531018 |
| New York | 17472 | 98586 | 0.1505454 |
| Georgia | 5209 | 29964 | 0.1480966 |
| Massachusetts | 12748 | 74697 | 0.1457831 |
| North Carolina | 6312 | 39518 | 0.1377264 |
| Missouri | 3234 | 20380 | 0.1369527 |
| Florida | 9040 | 57314 | 0.1362390 |
| California | 53816 | 342693 | 0.1357245 |
| Virginia | 4600 | 29469 | 0.1350201 |
| Minnesota | 6730 | 45965 | 0.1277161 |
| Illinois | 10649 | 74316 | 0.1253340 |
| Colorado | 5045 | 35367 | 0.1248392 |
| Washington | 9572 | 67880 | 0.1235862 |
| Pennsylvania | 9181 | 66069 | 0.1220066 |
| Arizona | 4056 | 29204 | 0.1219483 |
| Tennessee | 2435 | 17556 | 0.1218048 |
| Oregon | 3635 | 26252 | 0.1216248 |
| South Carolina | 1856 | 13823 | 0.1183749 |
| Ohio | 8706 | 65003 | 0.1181131 |
| Kansas | 1409 | 10543 | 0.1178882 |
| Louisiana | 1243 | 9388 | 0.1169222 |
| Texas | 14560 | 110612 | 0.1163199 |
| Kentucky | 1343 | 10214 | 0.1162066 |
| Connecticut | 4106 | 31505 | 0.1153015 |
| Alabama | 1254 | 9627 | 0.1152468 |
| Indiana | 3864 | 30089 | 0.1138044 |
| Wisconsin | 4529 | 35678 | 0.1126421 |
| Michigan | 8819 | 77834 | 0.1017737 |
| Utah | 1927 | 17285 | 0.1003019 |
| Oklahoma | 1231 | 11331 | 0.0979940 |
| Iowa | 1484 | 13770 | 0.0972860 |
| New Hampshire | 1216 | 11505 | 0.0955900 |
# map of counties
library(leaflet)
library(sf)
counties <- st_transform(download_census_shapes(
paste0(dirname(outDir), "/maps"),
entity = "county"
), "WGS84")
counties$NAME <- paste0(counties$NAME, ", ", state_names[counties$STATEFP])
breakdown_counties <- as.data.frame(t(vapply(
split(inventors_last_seen$pred_fem, inventors_last_seen$county_fips),
function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
c(0, 0)
)))
breakdown_counties$Total <- rowSums(breakdown_counties)
breakdown_counties$Proportion_Female <- breakdown_counties$Female / breakdown_counties$Total
breakdown_counties$Capped_Total <- breakdown_counties$Total
breakdown_counties$Capped_Total[breakdown_counties$Capped_Total > 5e4] <- 1e5
breakdown_counties <- breakdown_counties[counties$GEOID, ]
pal <- colorNumeric(
scico::scico(255, direction = -1, palette = "vik"), breakdown_counties$Proportion_Female
)
pal_total <- colorNumeric(
scico::scico(255, direction = -1, palette = "lajolla"), breakdown_counties$Capped_Total
)
leaflet(counties, options = leafletOptions(attributionControl = FALSE)) |>
addProviderTiles("CartoDB.Positron") |>
setView(-95.5810546875, 39.5040407055842, 4) |>
addControl("Proportion of Patents with Any Inventor Assigned Female", "topright") |>
addLayersControl(position = "topleft", overlayGroups = c("Total", "Proportion")) |>
addLegend(
"bottomright", pal_total, breakdown_counties$Capped_Total,
opacity = 1,
title = "Totals", group = "Total"
) |>
addPolygons(
fillColor = pal_total(breakdown_counties$Capped_Total),
fillOpacity = .8, weight = 1, color = "#000", highlightOptions = highlightOptions(color = "#fff"),
group = "Total", label = paste0("County: ", counties$NAME, "; Total: ", breakdown_counties$Total)
) |>
hideGroup("Total") |>
addLegend(
"bottomright", pal, breakdown_counties$Proportion_Female,
opacity = 1,
title = "Proportions", group = "Proportion"
) |>
addPolygons(
fillColor = pal(breakdown_counties$Proportion_Female), fillOpacity = .8, weight = 1, color = "#000",
highlightOptions = highlightOptions(color = "#fff"),
group = "Proportion", label = paste0(
"County: ", counties$NAME,
"; Female: ", breakdown_counties$Female,
"; Male: ", breakdown_counties$Male,
"; Proportion Female: ", round(breakdown_counties$Proportion_Female, 3)
)
)